# Install packages if needed
# install.packages(c("knitr", "dplyr", "survival", "ggplot2", "here", "tibble"))
library(knitr)
library(dplyr)
library(survival)
library(ggplot2)
library(tibble)
# devtools::install_github("zabore/ezfun")
ezfun::set_ccf_palette("contrast")
# install.packages(c("lubridate", "ggsurvfit", "gtsummary", "tidycmprsk"))
library(lubridate)
library(ggsurvfit)
library(gtsummary)
library(tidycmprsk)
# devtools::install_github("zabore/condsurv")
library(condsurv)
library(psych)
library(kableExtra)
#install.packages("xfun")


############
# Load the exercise data; the script expects the .RData file to provide `dfex`.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
data_path <- "C:\\Users\\olive\\OneDrive\\Desktop\\Adv Suv Analysis\\df_exercise.RData"
load(file = data_path)
head(dfex)


# Event indicator for Surv(): 0 where censor == 1, 1 otherwise (NA stays NA)
dfex$event <- as.numeric(dfex$censor != 1)
head(dfex)


##################################################################################
#### 1. Explore Group Differences  ####
#*******************************************************************************

# Build the survival response (follow-up time + event indicator) with
# survival::Surv()
surv_obj <- Surv(dfex$time, dfex$event)
# Sanity check: print the first ten entries to confirm the coding was read
# as intended
surv_obj[1:10]


# Kaplan-Meier estimates via survfit(), stratified by the categorical
# variable `male` so that every statistic is reported per group (stratum).
# "km_fit" is reused by the plots further down.
km_fit <- survfit(Surv(time, event) ~ male, data = dfex)
km_summary <- summary(km_fit)
# Full life table per stratum
km_summary
# Per-stratum summary table (includes the median lifetimes)
km_summary$table


# survminer supplies ggsurvplot() for nicer survival plots
library(survminer)

# Customized Kaplan-Meier plot, one curve per stratum of km_fit
sex_colours <- c("#E7B800", "#2E9FDF")  # custom line colours
sex_labels <- c("Female", "Male")       # legend text; adjust as appropriate
g1 <- ggsurvplot(
  fit = km_fit,
  data = dfex,
  title = "Kaplan-Meier Survival by Sex",
  xlab = "Days",
  ylab = "Survival probability",
  legend.labs = sex_labels,
  palette = sex_colours,
  size = 1.2,                           # line thickness
  conf.int = FALSE,                     # do not show 95% confidence intervals
  ggtheme = theme_bw()                  # clean background
)
g1

# Cumulative hazard curves from the same Kaplan-Meier fit, one per stratum
cumhaz_colours <- c("#E7B800", "#2E9FDF")  # custom line colours
cumhaz_labels <- c("Female", "Male")       # legend text; adjust as appropriate
g2 <- ggsurvplot(
  fit = km_fit,
  data = dfex,
  fun = "cumhaz",                          # plot cumulative hazard
  title = "Cumulative Hazard by Sex",
  xlab = "Days",
  ylab = "Cumulative hazard",
  legend.labs = cumhaz_labels,
  palette = cumhaz_colours,
  size = 1.2,                              # line thickness
  conf.int = FALSE,                        # do not show 95% confidence intervals
  ggtheme = theme_bw()
)
g2

# Create plots of the kernel-smoothed hazard function by sex
# Install if needed
#install.packages("muhaz")
library(muhaz)

# Fit a kernel-smoothed hazard for one sex group of dfex.
#   male_value: value of dfex$male that selects the group (0 or 1)
#   ...:        extra arguments forwarded to muhaz() (e.g. bandwidth settings)
# which() drops rows whose `male` is NA; plain logical indexing would pass
# NA times/events on to muhaz().
fit_smoothed_hazard <- function(male_value, ...) {
  rows <- which(dfex$male == male_value)
  muhaz(dfex$time[rows], dfex$event[rows], ...)
}

# Turn a muhaz fit into a long data frame (grid time, hazard estimate, label).
hazard_frame <- function(fit, label) {
  data.frame(time = fit$est.grid, hazard = fit$haz.est, sex = label)
}

# Line plot of the smoothed hazards with the custom colours used for sex.
# `linewidth` replaces `size`, which is deprecated for lines in ggplot2 >= 3.4.0.
plot_smoothed_hazard <- function(hazards, title) {
  ggplot(hazards, aes(x = time, y = hazard, color = sex)) +
    geom_line(linewidth = 1.2) +
    scale_color_manual(values = c("Female" = "#E7B800", "Male" = "#2E9FDF")) +
    labs(x = "Days", y = "Kernel-smoothed hazard", title = title) +
    theme_bw()
}

# --- muhaz() default bandwidth settings ---
haz_male <- fit_smoothed_hazard(1)
haz_female <- fit_smoothed_hazard(0)
# Combine for ggplot
df_male <- hazard_frame(haz_male, "Male")
df_female <- hazard_frame(haz_female, "Female")
haz_df <- bind_rows(df_male, df_female)
# Plot with custom colors
plot_smoothed_hazard(haz_df, "Kernel-smoothed Hazard by Sex")


# I will use a global bandwidth now, i.e. 60 days
# (per the muhaz documentation, a single-value bw.grid with
# bw.method = "global" is used directly as the bandwidth)
# For males
haz_male <- fit_smoothed_hazard(1, bw.method = "global", bw.grid = 60)
# For females
haz_female <- fit_smoothed_hazard(0, bw.method = "global", bw.grid = 60)
# Combine for ggplot
df_male <- hazard_frame(haz_male, "Male")
df_female <- hazard_frame(haz_female, "Female")
haz_df <- bind_rows(df_male, df_female)
# Plot with custom colors
g3 <- plot_smoothed_hazard(
  haz_df,
  "Kernel-smoothed Hazard by Sex (bw = 60 days)"
)
g3

# Make sure the three graphs use the same range and break values for the x axis:
# Define desired x-axis range and breaks
x_limits <- c(0, 365)  # adjust as needed for your data
# Twelve equal (approximately monthly) intervals spanning x_limits exactly.
# Deriving the breaks from x_limits keeps the final break at 365;
# seq(0, 365, by = 30.42) stopped at 334.62 because 12 * 30.42 > 365.
# Rounded to 2 decimals so the tick labels stay short.
x_breaks <- round(seq(x_limits[1], x_limits[2], length.out = 13), 2)
# Helper: put the shared x axis (x_limits / x_breaks) on a plot
with_shared_x <- function(p) {
  p + scale_x_continuous(limits = x_limits, breaks = x_breaks)
}
# Helper: remove the legend from a plot
without_legend <- function(p) {
  p + theme(legend.position = "none")
}

# Same x axis on all three panels; legends dropped from the two ggsurvplot
# panels only (g3 keeps its legend)
g1$plot <- without_legend(with_shared_x(g1$plot))
g2$plot <- without_legend(with_shared_x(g2$plot))
g3 <- with_shared_x(g3)
# Stack the panels in one column, vertically aligned, with ggarrange
library(ggpubr)
ggarrange(g1$plot, g2$plot, g3, ncol = 1, align = "v")

##################################################################################
#### 2. Explore Group Differences using Log Cumulative Hazard Function  ####
#*******************************************************************************

# Build a data frame holding, per group, the time, the survivor function,
# the negative log survivor function and the log of the latter
s <- summary(km_fit)
###
# -log S(t); computed once and reused for the log transform below.
# It is Inf where S(t) = 0, and its log is -Inf where S(t) = 1.
cum_hazard <- -log(s$surv)
df_nlsf <- data.frame(
  time = s$time,
  surv = s$surv,
  neg_log_surv = cum_hazard,
  log_nls = log(cum_hazard),
  Group = s$strata
)

# Plot the negative log survivor function, -log S(t), by group
library(ggplot2)

# Group holds the strata labels "male=0" / "male=1"; they are recoloured and
# relabelled as Female / Male below.
# `linewidth` replaces `size`, which is deprecated for lines in ggplot2 >= 3.4.0.
gH <- ggplot(df_nlsf, aes(x = time, y = neg_log_surv, color = Group)) +
  geom_step(linewidth = 1.2) +
  labs(x = "Time", y = "H(t)",
       title = "Negative Log Survivor Function by Sex") +
  scale_color_manual(values = c("male=0" = "#E7B800", "male=1" = "#2E9FDF"),
                     labels = c("male=0" = "Female", "male=1" = "Male")) +
  theme_bw()

gH


# Plot the log of the negative log survivor function, log(-log S(t)), by group
library(ggplot2)

# Group holds the strata labels "male=0" / "male=1"; they are recoloured and
# relabelled as Female / Male below.
# `linewidth` replaces `size`, which is deprecated for lines in ggplot2 >= 3.4.0.
glogH <- ggplot(df_nlsf, aes(x = time, y = log_nls, color = Group)) +
  geom_step(linewidth = 1.2) +
  labs(x = "Time", y = "Log H(t)",
       title = "Log of -Log Survivor Function by Sex") +
  scale_color_manual(values = c("male=0" = "#E7B800", "male=1" = "#2E9FDF"),
                     labels = c("male=0" = "Female", "male=1" = "Male")) +
  theme_bw()

glogH

##################################################################################
#### 3. Cox Regression Model  ####
#*******************************************************************************

# Unadjusted Cox model with sex as the only predictor (printed, not stored)
coxph(Surv(time, event) ~ male, data = dfex)

# Dummy-code SES: indicators for ses == 1 and ses == 2, so every other SES
# value acts as the reference category.
# NOTE(review): presumably the reference is the "more deprived" level -
# confirm against the codebook.
dfex$highses <- as.numeric(dfex$ses == 1)
dfex$modses <- as.numeric(dfex$ses == 2)

# Adjusted Cox model: sex plus the two SES indicators
m2fit <- coxph(Surv(time, event) ~ male + highses + modses, data = dfex)
summary(m2fit)
# Present the fit as a table
library(gtsummary)
# exponentiate = TRUE reports hazard ratios (with 95% CIs) instead of log-HRs
summary_table <- tbl_regression(m2fit, exponentiate = TRUE)
summary_table

##################################################################################
#### 4. Cox Regression Model - Ties  ####
#*******************************************************************************
# Same adjusted model, with tied event times handled by the Breslow
# approximation requested explicitly through `ties`; the fit is only printed.
coxph(
  Surv(time, event) ~ male + highses + modses,
  data = dfex,
  ties = "breslow"
)


##################################################################################
#### 5. Cox Regression Model: Recover baseline functions and compare prototypical groups  ####
#*******************************************************************************

############################
#### "Baseline" functions ####
#***************************

# Cox model: sex plus the two SES indicators
coxm2 <- coxph(Surv(time, event) ~ male + highses + modses, data = dfex)
coxm2

# Covariate profiles for the four prototypical groups
# (sex x {high SES, reference SES}); modses is 0 in every profile
femalehigh <- data.frame(male = 0, highses = 1, modses = 0)
maledepr   <- data.frame(male = 1, highses = 0, modses = 0)
malehigh   <- data.frame(male = 1, highses = 1, modses = 0)
femaledepr <- data.frame(male = 0, highses = 0, modses = 0)

# Model-based survival / cumulative hazard curves for each profile
s.femalehigh <- survfit(coxm2, newdata = femalehigh)
s.maledepr   <- survfit(coxm2, newdata = maledepr)
s.malehigh   <- survfit(coxm2, newdata = malehigh)
s.femaledepr <- survfit(coxm2, newdata = femaledepr)


library(patchwork)

# Pull time, survival and cumulative hazard out of one fitted curve and tag
# the rows with the group label used in the plots.
curve_frame <- function(fit, label) {
  data.frame(
    time = fit$time,
    surv = fit$surv,
    cumhaz = fit$cumhaz,
    group = label
  )
}

# One data frame per prototypical group
fh <- curve_frame(s.femalehigh, "Female - High SES")
mh <- curve_frame(s.malehigh, "Male - High SES")
fd <- curve_frame(s.femaledepr, "Female - Deprived SES")
md <- curve_frame(s.maledepr, "Male - Deprived SES")

# Stack them into one long data frame for plotting
combineds <- rbind(fh, mh, md, fd)

# Define desired x-axis range and breaks
x_limits <- c(0, 365)  # adjust as needed for your data
# Four equal (approximately quarterly) intervals spanning x_limits exactly.
# Deriving the breaks from x_limits keeps the final break at 365;
# seq(0, 365, by = 91.26) stopped at 273.78 because 4 * 91.26 > 365.
x_breaks <- seq(x_limits[1], x_limits[2], length.out = 5)

# Plot the model-based survivor functions for all prototypical groups.
# `linewidth` replaces `size`, which is deprecated for lines in ggplot2 >= 3.4.0.
p_surv <- ggplot(combineds, aes(x = time, y = surv, color = group)) +
  geom_step(linewidth = 1.2, alpha = 0.9) +
  scale_color_viridis_d(option = "D") +
  scale_x_continuous(breaks = x_breaks, limits = x_limits) +
  labs(
    title = "Survivor Functions by Group",
    x = "Time",
    y = "Survival Probability",
    color = "Group"
  ) +
  theme_minimal(base_size = 14) +
  # Legend anchored in the bottom-left corner of the panel.
  # NOTE(review): a numeric legend.position is deprecated from ggplot2 3.5.0
  # (legend.position.inside); left unchanged so older versions keep working.
  theme(legend.position = c(0, 0), legend.justification = c(0, 0))


print(p_surv)


# Plot the model-based cumulative hazard functions for all prototypical groups.
# `linewidth` replaces `size`, which is deprecated for lines in ggplot2 >= 3.4.0.
# The legend is hidden here because p_surv, shown next to it, carries it.
# NOTE(review): geom_line() joins the estimates linearly, whereas the survivor
# plot uses steps - consider geom_step() for consistency.
p_cumhaz <- ggplot(combineds, aes(x = time, y = cumhaz, color = group)) +
  geom_line(linewidth = 1.2, alpha = 0.9) +
  scale_color_viridis_d(option = "D") +
  scale_x_continuous(breaks = x_breaks, limits = x_limits) +
  labs(
    title = "Cum.Hazard by Group",  # fixed typo (was "Cum.Hardzd")
    x = "Days",
    y = "Cum.Hazard",
    color = "Group"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none")

print(p_cumhaz)


# Combine plots side by side (patchwork's `|` operator)
combined_plots <- p_surv | p_cumhaz
combined_plots